"""Exploratory data analysis and preprocessing of the train and test datasets for the Home Credit default-risk project."""
# Log the run timestamp and scikit-learn version for reproducibility.
import datetime
print(datetime.datetime.now())
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn.metrics
# Imputers to deal with Missing Data
from sklearn.impute import SimpleImputer
# Load the main training table (hard-coded local path; one row per loan application).
df_init = pd.read_csv(r"C:\Users\mamta\MMAI 2020\MMAI823_AI in Finance\Team Assignments\Team Project\Home Credit Risk Dataset\application_train.csv",sep=',')
# First look: column names, summary statistics, and per-column NaN counts.
list(df_init)
df_init.describe().transpose()
df_init.isnull().sum()
def missing_zero_values_table(df_init):
    """Summarise zero and missing values per column of ``df_init``.

    Returns a DataFrame with one row per column that has at least one
    missing value, holding the count of zeros, the count and percentage
    of NaNs, their combined total and percentage, and the column dtype,
    sorted by '% of Total Values' descending (rounded to 1 decimal).
    Also prints a one-line summary of the input's shape.
    """
    zero_val = (df_init == 0.00).astype(int).sum(axis=0)
    mis_val = df_init.isnull().sum()
    # Reuse the NaN counts instead of recomputing isnull().sum().
    mis_val_percent = 100 * mis_val / len(df_init)
    mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
    mz_table = mz_table.rename(
        columns={0: 'Zero Values', 1: 'Missing Values', 2: '% of Total Values'})
    mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
    mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df_init)
    mz_table['Data Type'] = df_init.dtypes
    # Keep only columns that actually have missing values, worst first.
    mz_table = mz_table[mz_table.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has {} columns and {} Rows.\n"
          "There are {} columns that have missing values.".format(
              df_init.shape[1], df_init.shape[0], mz_table.shape[0]))
    # mz_table.to_excel('missing_and_zero_values.xlsx', freeze_panes=(1,0), index = False)
    return mz_table
missing_zero_values_table(df_init)
import seaborn as sns
# BUG FIX: '==' only compared the current rcParam value and discarded the
# result; '=' actually enables matplotlib interactive mode.
mpl.rcParams['interactive'] = True
#Using Pearson Correlation
plt.figure(figsize=(24,20))
cor = df_init.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.savefig('Correlation_Pearson')
plt.show()
# Find correlations with the target and sort — reuse the matrix computed
# above instead of recomputing df_init.corr() from scratch.
correlations = cor['TARGET'].sort_values()
# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))
# Convert the negative "days relative to application" columns into positive
# years (2 decimals) and drop the originals.
# NOTE(review): DAYS_EMPLOYED appears to use a large positive sentinel for
# non-employed clients in this dataset, which would become a large negative
# year value here — confirm whether it should be cleaned first.
df_init['DAYS_BIRTH_YEAR'] = round((df_init['DAYS_BIRTH'] / -365),2 )
df_init['DAYS_EMPLOYED_YEAR'] = round((df_init['DAYS_EMPLOYED'] / -365),2 )
df_init['DAYS_REGISTRATION_YEAR'] = round((df_init['DAYS_REGISTRATION'] / -365),2 )
df_init['DAYS_ID_PUBLISH_YEAR'] = round((df_init['DAYS_ID_PUBLISH'] / -365),2)
df_init = df_init.drop(["DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION","DAYS_ID_PUBLISH"], axis=1)
# Drop all 20 FLAG_DOCUMENT_* indicator columns — not used as features.
df_init = df_init.drop(["FLAG_DOCUMENT_2","FLAG_DOCUMENT_3","FLAG_DOCUMENT_4","FLAG_DOCUMENT_5","FLAG_DOCUMENT_6","FLAG_DOCUMENT_7",
"FLAG_DOCUMENT_8","FLAG_DOCUMENT_9","FLAG_DOCUMENT_10","FLAG_DOCUMENT_11","FLAG_DOCUMENT_12","FLAG_DOCUMENT_13","FLAG_DOCUMENT_14",
"FLAG_DOCUMENT_15","FLAG_DOCUMENT_16","FLAG_DOCUMENT_17","FLAG_DOCUMENT_18","FLAG_DOCUMENT_19","FLAG_DOCUMENT_20","FLAG_DOCUMENT_21"], axis=1)
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Bar chart of the CNT_CHILDREN distribution.
temp_children = df_init["CNT_CHILDREN"].value_counts()
df_children = pd.DataFrame({'CNT_CHILDREN': temp_children.index,'values': temp_children.values})
trace = go.Bar(
x = df_children['CNT_CHILDREN'],y = df_children['values'],
name="Number of Children",
marker=dict(color="Blue"),
text=df_children['values']
)
data = [trace]
layout = dict(title = 'Number of Chilren',
xaxis = dict(title = 'Children_Count', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='Children_Count')
df_children
# Cap the rare large values: anything above 5 children becomes 6.
df_init['CNT_CHILDREN'].values[df_init['CNT_CHILDREN'].values > 5] = 6
# Bar chart of the CNT_FAM_MEMBERS distribution.
temp_family_members = df_init["CNT_FAM_MEMBERS"].value_counts()
df_family_members = pd.DataFrame({'CNT_FAM_MEMBERS': temp_family_members.index,'values': temp_family_members.values})
trace = go.Bar(
x = df_family_members['CNT_FAM_MEMBERS'],y = df_family_members['values'],
name="Number of Family Members",
marker=dict(color="Blue"),
text=df_family_members['values']
)
data = [trace]
layout = dict(title = 'Number of Family Members',
xaxis = dict(title = 'Family_Members_Count', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='Family_Members_Count')
df_family_members
# Locate missing CNT_FAM_MEMBERS, fill with 0, cast to int, then cap values
# above 6 at 7.
np.where(pd.isnull(df_init['CNT_FAM_MEMBERS']))
df_init['CNT_FAM_MEMBERS'].fillna(0, inplace=True)
df_init['CNT_FAM_MEMBERS'] = df_init['CNT_FAM_MEMBERS'].astype('int',copy=False)
df_init['CNT_FAM_MEMBERS'].values[df_init['CNT_FAM_MEMBERS'].values > 6] = 7
df_init['CNT_FAM_MEMBERS'].unique()
# Bar chart of REGION_RATING_CLIENT counts.
temp_REGION_RATING_CLIENT = df_init["REGION_RATING_CLIENT"].value_counts()
df_REGION_RATING_CLIENT = pd.DataFrame({'REGION_RATING_CLIENT': temp_REGION_RATING_CLIENT.index,'values': temp_REGION_RATING_CLIENT.values})
trace = go.Bar(
x = df_REGION_RATING_CLIENT['REGION_RATING_CLIENT'],y = df_REGION_RATING_CLIENT['values'],
name="Number of Region Rating Client",
marker=dict(color="Blue"),
text=df_REGION_RATING_CLIENT['values']
)
data = [trace]
layout = dict(title = 'Number of Region Rating Client',
xaxis = dict(title = 'REGION_RATING_CLIENT', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='REGION_RATING_CLIENT')
np.where(pd.isnull(df_init['REGION_RATING_CLIENT']))
df_init['REGION_RATING_CLIENT'].unique()
# Label-encode then one-hot encode REGION_RATING_CLIENT, append the dummy
# columns REGION_RATING_CLIENT_0..N, and drop the intermediate/original columns.
from sklearn.preprocessing import LabelEncoder
le_REGION_RATING_CLIENT = LabelEncoder()
df_init['REGION_RATING_CLIENT_ENCODED'] = le_REGION_RATING_CLIENT.fit_transform(df_init.REGION_RATING_CLIENT)
from sklearn.preprocessing import OneHotEncoder
REGION_RATING_CLIENT_ohe = OneHotEncoder()
X = REGION_RATING_CLIENT_ohe.fit_transform(df_init.REGION_RATING_CLIENT_ENCODED.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = ["REGION_RATING_CLIENT_"+str(int(i)) for i in range(X.shape[1])])
df_init = pd.concat([df_init, dfOneHot], axis=1)
df_init
df_init = df_init.drop(["REGION_RATING_CLIENT_ENCODED", "REGION_RATING_CLIENT"], axis=1)
# Same treatment (plot, label-encode, one-hot, drop) for REGION_RATING_CLIENT_W_CITY.
temp_REGION_RATING_CLIENT_W_CITY = df_init["REGION_RATING_CLIENT_W_CITY"].value_counts()
df_REGION_RATING_CLIENT_W_CITY = pd.DataFrame({'REGION_RATING_CLIENT_W_CITY': temp_REGION_RATING_CLIENT_W_CITY.index,'values': temp_REGION_RATING_CLIENT_W_CITY.values})
trace = go.Bar(
x = df_REGION_RATING_CLIENT_W_CITY['REGION_RATING_CLIENT_W_CITY'],y = df_REGION_RATING_CLIENT_W_CITY['values'],
name="Number of Region Rating Client with City",
marker=dict(color="Blue"),
text=df_REGION_RATING_CLIENT_W_CITY['values']
)
data = [trace]
layout = dict(title = 'Number of Region Rating Client with City',
xaxis = dict(title = 'REGION_RATING_CLIENT_W_CITY', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='REGION_RATING_CLIENT_W_CITY')
np.where(pd.isnull(df_init['REGION_RATING_CLIENT_W_CITY']))
le_REGION_RATING_CLIENT_W_CITY = LabelEncoder()
df_init['REGION_RATING_CLIENT_W_CITY_ENCODED'] = le_REGION_RATING_CLIENT_W_CITY.fit_transform(df_init.REGION_RATING_CLIENT_W_CITY)
REGION_RATING_CLIENT_W_CITY_ohe = OneHotEncoder()
X = REGION_RATING_CLIENT_W_CITY_ohe.fit_transform(df_init.REGION_RATING_CLIENT_W_CITY_ENCODED.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = ["REGION_RATING_CLIENT_W_CITY_"+str(int(i)) for i in range(X.shape[1])])
df_init = pd.concat([df_init, dfOneHot], axis=1)
df_init
df_init = df_init.drop(["REGION_RATING_CLIENT_W_CITY_ENCODED", "REGION_RATING_CLIENT_W_CITY"], axis=1)
# Same treatment for WEEKDAY_APPR_PROCESS_START.
temp_WEEKDAY_APPR_PROCESS_START = df_init["WEEKDAY_APPR_PROCESS_START"].value_counts()
df_WEEKDAY_APPR_PROCESS_START = pd.DataFrame({'WEEKDAY_APPR_PROCESS_START': temp_WEEKDAY_APPR_PROCESS_START.index,'values': temp_WEEKDAY_APPR_PROCESS_START.values})
trace = go.Bar(
x = df_WEEKDAY_APPR_PROCESS_START['WEEKDAY_APPR_PROCESS_START'],y = df_WEEKDAY_APPR_PROCESS_START['values'],
name="Weekday Appraisal process start",
marker=dict(color="Blue"),
text=df_WEEKDAY_APPR_PROCESS_START['values']
)
data = [trace]
layout = dict(title = 'Weekday Appraisal process start',
xaxis = dict(title = 'df_WEEKDAY_APPR_PROCESS_START', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_WEEKDAY_APPR_PROCESS_START')
np.where(pd.isnull(df_init['WEEKDAY_APPR_PROCESS_START']))
le_WEEKDAY_APPR_PROCESS_START = LabelEncoder()
df_init['WEEKDAY_APPR_PROCESS_START_ENCODED'] = le_WEEKDAY_APPR_PROCESS_START.fit_transform(df_init.WEEKDAY_APPR_PROCESS_START)
WEEKDAY_APPR_PROCESS_START_ohe = OneHotEncoder()
X = WEEKDAY_APPR_PROCESS_START_ohe.fit_transform(df_init.WEEKDAY_APPR_PROCESS_START_ENCODED.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = ["WEEKDAY_APPR_PROCESS_START_"+str(int(i)) for i in range(X.shape[1])])
df_init = pd.concat([df_init, dfOneHot], axis=1)
df_init
df_init = df_init.drop(["WEEKDAY_APPR_PROCESS_START_ENCODED", "WEEKDAY_APPR_PROCESS_START"], axis=1)
# HOUR_APPR_PROCESS_START: plot only, then drop the column entirely.
temp_HOUR_APPR_PROCESS_START = df_init["HOUR_APPR_PROCESS_START"].value_counts()
df_HOUR_APPR_PROCESS_START = pd.DataFrame({'HOUR_APPR_PROCESS_START': temp_HOUR_APPR_PROCESS_START.index,'values': temp_HOUR_APPR_PROCESS_START.values})
trace = go.Bar(
x = df_HOUR_APPR_PROCESS_START['HOUR_APPR_PROCESS_START'],y = df_HOUR_APPR_PROCESS_START['values'],
name="Hour Appraisal process start",
marker=dict(color="Blue"),
text=df_HOUR_APPR_PROCESS_START['values']
)
data = [trace]
layout = dict(title = 'Hour Appraisal process start',
xaxis = dict(title = 'df_HOUR_APPR_PROCESS_START', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_HOUR_APPR_PROCESS_START')
df_init = df_init.drop(["HOUR_APPR_PROCESS_START"], axis=1)
# ORGANIZATION_TYPE: inspect the distribution and distinct values
# (consolidation of its subtypes happens below).
temp_ORGANIZATION_TYPE = df_init["ORGANIZATION_TYPE"].value_counts()
df_ORGANIZATION_TYPE = pd.DataFrame({'ORGANIZATION_TYPE': temp_ORGANIZATION_TYPE.index,'values': temp_ORGANIZATION_TYPE.values})
trace = go.Bar(
x = df_ORGANIZATION_TYPE['ORGANIZATION_TYPE'],y = df_ORGANIZATION_TYPE['values'],
name="Organization Types",
marker=dict(color="Blue"),
text=df_ORGANIZATION_TYPE['values']
)
data = [trace]
layout = dict(title = 'Organization Types',
xaxis = dict(title = 'df_ORGANIZATION_TYPE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_ORGANIZATION_TYPE')
np.where(pd.isnull(df_init['ORGANIZATION_TYPE']))
df_init['ORGANIZATION_TYPE'].unique()
# Change the organization type to reduce the number of distinct values:
# every "Business Entity Type N" becomes "Business", every "Industry: type N"
# becomes "Industry", every "Trade: type N" becomes "Trade", and every
# "Transport: type N" becomes "Transport".
# Collapse the fine-grained organization subtypes ("Business Entity Type N",
# "Industry: type N", "Trade: type N", "Transport: type N") into four
# umbrella categories to reduce cardinality.
df_init.loc[df_init['ORGANIZATION_TYPE'].str.contains('Business'), 'ORGANIZATION_TYPE'] = 'Business'
df_init.loc[df_init['ORGANIZATION_TYPE'].str.contains('Industry'), 'ORGANIZATION_TYPE'] = 'Industry'
df_init.loc[df_init['ORGANIZATION_TYPE'].str.contains('Trade'), 'ORGANIZATION_TYPE'] = 'Trade'
df_init.loc[df_init['ORGANIZATION_TYPE'].str.contains('Transport'), 'ORGANIZATION_TYPE'] = 'Transport'
df_init['ORGANIZATION_TYPE'].unique()
# Re-plot the distribution after consolidation.
temp_ORGANIZATION_TYPE = df_init["ORGANIZATION_TYPE"].value_counts()
df_ORGANIZATION_TYPE = pd.DataFrame({'ORGANIZATION_TYPE': temp_ORGANIZATION_TYPE.index,'values': temp_ORGANIZATION_TYPE.values})
trace = go.Bar(
x = df_ORGANIZATION_TYPE['ORGANIZATION_TYPE'],y = df_ORGANIZATION_TYPE['values'],
name="Organization Types",
marker=dict(color="Blue"),
text=df_ORGANIZATION_TYPE['values']
)
data = [trace]
layout = dict(title = 'Organization Types',
xaxis = dict(title = 'df_ORGANIZATION_TYPE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_ORGANIZATION_TYPE')
np.where(pd.isnull(df_init['ORGANIZATION_TYPE']))
# The abandoned label/one-hot encoding experiment that used to live here was
# removed (dead commented-out code). ORGANIZATION_TYPE is kept as a single
# 'category'-dtype column, so the later get_dummies loop over 'object'
# columns skips it and it is selected as-is into the feature set.
df_init['ORGANIZATION_TYPE'] = df_init['ORGANIZATION_TYPE'].astype('category')
# NAME_INCOME_TYPE: plot the distribution, check for NaNs, keep as category dtype.
temp_NAME_INCOME_TYPE = df_init["NAME_INCOME_TYPE"].value_counts()
df_NAME_INCOME_TYPE = pd.DataFrame({'NAME_INCOME_TYPE': temp_NAME_INCOME_TYPE.index,'values': temp_NAME_INCOME_TYPE.values})
trace = go.Bar(
x = df_NAME_INCOME_TYPE['NAME_INCOME_TYPE'],y = df_NAME_INCOME_TYPE['values'],
name="Income Types",
marker=dict(color="Blue"),
text=df_NAME_INCOME_TYPE['values']
)
data = [trace]
layout = dict(title = 'Income Types',
xaxis = dict(title = 'df_NAME_INCOME_TYPE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_NAME_INCOME_TYPE')
np.where(pd.isnull(df_init['NAME_INCOME_TYPE']))
df_init['NAME_INCOME_TYPE'] = df_init['NAME_INCOME_TYPE'].astype('category')
# NAME_EDUCATION_TYPE: same treatment.
temp_NAME_EDUCATION_TYPE = df_init["NAME_EDUCATION_TYPE"].value_counts()
df_NAME_EDUCATION_TYPE = pd.DataFrame({'NAME_EDUCATION_TYPE': temp_NAME_EDUCATION_TYPE.index,'values': temp_NAME_EDUCATION_TYPE.values})
trace = go.Bar(
x = df_NAME_EDUCATION_TYPE['NAME_EDUCATION_TYPE'],y = df_NAME_EDUCATION_TYPE['values'],
name="Education Types",
marker=dict(color="Blue"),
text=df_NAME_EDUCATION_TYPE['values']
)
data = [trace]
layout = dict(title = 'Education Types',
xaxis = dict(title = 'df_NAME_EDUCATION_TYPE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_NAME_EDUCATION_TYPE')
np.where(pd.isnull(df_init['NAME_EDUCATION_TYPE']))
df_init['NAME_EDUCATION_TYPE'] = df_init['NAME_EDUCATION_TYPE'].astype('category')
# NAME_FAMILY_STATUS: same treatment.
temp_NAME_FAMILY_STATUS = df_init["NAME_FAMILY_STATUS"].value_counts()
df_NAME_FAMILY_STATUS = pd.DataFrame({'NAME_FAMILY_STATUS': temp_NAME_FAMILY_STATUS.index,'values': temp_NAME_FAMILY_STATUS.values})
trace = go.Bar(
x = df_NAME_FAMILY_STATUS['NAME_FAMILY_STATUS'],y = df_NAME_FAMILY_STATUS['values'],
name="Family Status Types",
marker=dict(color="Blue"),
text=df_NAME_FAMILY_STATUS['values']
)
data = [trace]
layout = dict(title = 'Family Status Types',
xaxis = dict(title = 'df_NAME_FAMILY_STATUS', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_NAME_FAMILY_STATUS')
np.where(pd.isnull(df_init['NAME_FAMILY_STATUS']))
df_init['NAME_FAMILY_STATUS'] = df_init['NAME_FAMILY_STATUS'].astype('category')
df_init['NAME_FAMILY_STATUS']
# NAME_HOUSING_TYPE and NAME_TYPE_SUITE: plotted only (no dtype conversion here).
temp_NAME_HOUSING_TYPE = df_init["NAME_HOUSING_TYPE"].value_counts()
df_NAME_HOUSING_TYPE = pd.DataFrame({'NAME_HOUSING_TYPE': temp_NAME_HOUSING_TYPE.index,'values': temp_NAME_HOUSING_TYPE.values})
trace = go.Bar(
x = df_NAME_HOUSING_TYPE['NAME_HOUSING_TYPE'],y = df_NAME_HOUSING_TYPE['values'],
name="Housing Types",
marker=dict(color="Blue"),
text=df_NAME_HOUSING_TYPE['values']
)
data = [trace]
layout = dict(title = 'Housing Types',
xaxis = dict(title = 'df_NAME_HOUSING_TYPE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_NAME_HOUSING_TYPE')
temp_NAME_TYPE_SUITE = df_init["NAME_TYPE_SUITE"].value_counts()
df_NAME_TYPE_SUITE = pd.DataFrame({'NAME_TYPE_SUITE': temp_NAME_TYPE_SUITE.index,'values': temp_NAME_TYPE_SUITE.values})
trace = go.Bar(
x = df_NAME_TYPE_SUITE['NAME_TYPE_SUITE'],y = df_NAME_TYPE_SUITE['values'],
name="Suite Types",
marker=dict(color="Blue"),
text=df_NAME_TYPE_SUITE['values']
)
data = [trace]
layout = dict(title = 'Suite Types',
xaxis = dict(title = 'df_NAME_TYPE_SUITE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_NAME_TYPE_SUITE')
# OCCUPATION_TYPE: plot before cleaning.
temp_OCCUPATION_TYPE = df_init["OCCUPATION_TYPE"].value_counts()
df_OCCUPATION_TYPE = pd.DataFrame({'OCCUPATION_TYPE': temp_OCCUPATION_TYPE.index,'values': temp_OCCUPATION_TYPE.values})
trace = go.Bar(
x = df_OCCUPATION_TYPE['OCCUPATION_TYPE'],y = df_OCCUPATION_TYPE['values'],
name="Occupation Types",
marker=dict(color="Blue"),
text=df_OCCUPATION_TYPE['values']
)
data = [trace]
layout = dict(title = 'Occupation Types',
xaxis = dict(title = 'df_OCCUPATION_TYPE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_OCCUPATION_TYPE')
np.where(pd.isnull(df_init['OCCUPATION_TYPE']))
# Missing occupations become an explicit 'UNKNOWN' level.
df_init['OCCUPATION_TYPE'] = df_init.OCCUPATION_TYPE.fillna('UNKNOWN')
# Re-plot after filling to see the size of the UNKNOWN bucket.
temp_OCCUPATION_TYPE = df_init["OCCUPATION_TYPE"].value_counts()
df_OCCUPATION_TYPE = pd.DataFrame({'OCCUPATION_TYPE': temp_OCCUPATION_TYPE.index,'values': temp_OCCUPATION_TYPE.values})
trace = go.Bar(
x = df_OCCUPATION_TYPE['OCCUPATION_TYPE'],y = df_OCCUPATION_TYPE['values'],
name="Occupation Types",
marker=dict(color="Blue"),
text=df_OCCUPATION_TYPE['values']
)
data = [trace]
layout = dict(title = 'Occupation Types',
xaxis = dict(title = 'df_OCCUPATION_TYPE', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_OCCUPATION_TYPE')
df_init['OCCUPATION_TYPE'] = df_init['OCCUPATION_TYPE'].astype('category')
# CODE_GENDER: plot distribution and count unique clients per gender.
temp_CODE_GENDER = df_init["CODE_GENDER"].value_counts()
df_CODE_GENDER = pd.DataFrame({'CODE_GENDER': temp_CODE_GENDER.index,'values': temp_CODE_GENDER.values})
trace = go.Bar(
x = df_CODE_GENDER['CODE_GENDER'],y = df_CODE_GENDER['values'],
name="Genders",
marker=dict(color="Blue"),
text=df_CODE_GENDER['values']
)
data = [trace]
layout = dict(title = 'Genders',
xaxis = dict(title = 'df_CODE_GENDER', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_CODE_GENDER')
df_gender_count = df_init.groupby('CODE_GENDER')['SK_ID_CURR'].nunique()
df_gender_count
np.where(pd.isnull(df_init['CODE_GENDER']))
# Map the 'XNA' placeholder gender to 'M', then binarise: True for 'M',
# False for everything else.
df_init.loc[df_init['CODE_GENDER'].str.contains('XNA'), 'CODE_GENDER'] = 'M'
df_gender_count = df_init.groupby('CODE_GENDER')['SK_ID_CURR'].nunique()
df_gender_count
df_init['CODE_GENDER'] = np.where(df_init['CODE_GENDER'] == "M", True, False)
# FLAG_OWN_CAR: plot the Y/N distribution, then binarise (True for 'Y').
temp_FLAG_OWN_CAR = df_init["FLAG_OWN_CAR"].value_counts()
df_FLAG_OWN_CAR = pd.DataFrame({'FLAG_OWN_CAR': temp_FLAG_OWN_CAR.index,'values': temp_FLAG_OWN_CAR.values})
trace = go.Bar(
x = df_FLAG_OWN_CAR['FLAG_OWN_CAR'],y = df_FLAG_OWN_CAR['values'],
name="Flag Own Car",
marker=dict(color="Blue"),
text=df_FLAG_OWN_CAR['values']
)
data = [trace]
layout = dict(title = 'Flag Own Car',
xaxis = dict(title = 'df_FLAG_OWN_CAR', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_FLAG_OWN_CAR')
df_init['FLAG_OWN_CAR'] = np.where(df_init['FLAG_OWN_CAR'] == "Y", True, False)
# FLAG_OWN_REALTY: same plot-then-binarise treatment.
temp_FLAG_OWN_REALTY = df_init["FLAG_OWN_REALTY"].value_counts()
df_FLAG_OWN_REALTY = pd.DataFrame({'FLAG_OWN_REALTY': temp_FLAG_OWN_REALTY.index,'values': temp_FLAG_OWN_REALTY.values})
trace = go.Bar(
x = df_FLAG_OWN_REALTY['FLAG_OWN_REALTY'],y = df_FLAG_OWN_REALTY['values'],
name="Flag Own Realty",
marker=dict(color="Blue"),
text=df_FLAG_OWN_REALTY['values']
)
data = [trace]
layout = dict(title = 'Flag Own Realty',
xaxis = dict(title = 'df_FLAG_OWN_REALTY', showticklabels=True),
yaxis = dict(title = 'Record Count'),
hovermode = 'closest',width=600
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='df_FLAG_OWN_REALTY')
df_init['FLAG_OWN_REALTY'] = np.where(df_init['FLAG_OWN_REALTY'] == "Y", True, False)
# Convert the 1/0 city-mismatch indicators to booleans.
df_init['REG_CITY_NOT_LIVE_CITY'] = np.where(df_init['REG_CITY_NOT_LIVE_CITY'] == 1, True, False)
df_init['REG_CITY_NOT_WORK_CITY'] = np.where(df_init['REG_CITY_NOT_WORK_CITY'] == 1, True, False)
df_init.dtypes
# One-hot encode every remaining 'object' column in place: pop the column,
# then append its dummy indicator columns (prefixed with the original name).
for col in df_init.dtypes[df_init.dtypes == 'object'].index:
    for_dummy = df_init.pop(col)
    df_init = pd.concat([df_init, pd.get_dummies(for_dummy, prefix=col)], axis=1)
df_init.dtypes
# BUG FIX: '==' was a no-op comparison; '=' actually sets interactive mode.
mpl.rcParams['interactive'] = True
#Using Pearson Correlation
plt.figure(figsize=(24,20))
cor = df_init.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.savefig('Correlation_Pearson')
plt.show()
# Find correlations with the target and sort — reuse the matrix already computed.
correlations = cor['TARGET'].sort_values()
# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))
# Load the credit-bureau records (one row per previous credit per client).
df_bureau = pd.read_csv(r"C:\Users\mamta\MMAI 2020\MMAI823_AI in Finance\Team Assignments\Team Project\Home Credit Risk Dataset\bureau.csv",sep=',')
list(df_bureau)
df_bureau.describe().transpose()
df_bureau.head()
# Count distinct clients per credit type and plot.
credit_type_count = df_bureau.groupby('CREDIT_TYPE')['SK_ID_CURR'].nunique()
credit_type_count = credit_type_count.sort_values(ascending=False)
credit_type_count.plot(kind='bar')
credit_type_count
# Lump every credit type held by fewer than 900 distinct clients into a
# single 'Other type of loan' bucket.
m = df_bureau.CREDIT_TYPE.isin(credit_type_count.index[credit_type_count<900])
df_bureau.loc[m, 'CREDIT_TYPE'] = 'Other type of loan'
# Inspect currency, then drop it — presumably near-constant; confirm from the plot.
credit_currency_count = df_bureau.groupby('CREDIT_CURRENCY')['SK_ID_CURR'].nunique()
credit_currency_count = credit_currency_count.sort_values(ascending=False)
credit_currency_count.plot(kind='bar')
credit_currency_count
df_bureau = df_bureau.drop(["CREDIT_CURRENCY"], axis=1)
# Visualize credit_active
credit_active_count = df_bureau.groupby('CREDIT_ACTIVE')['SK_ID_CURR'].nunique()
credit_active_count = credit_active_count.sort_values(ascending=False)
credit_active_count.plot(kind='bar')
credit_active_count
# One-hot encode the bureau categoricals, re-attach the client id, and
# aggregate per client with both 'sum' and 'mean'.
categorical = pd.get_dummies(df_bureau.select_dtypes('object'))
categorical['SK_ID_CURR'] = df_bureau['SK_ID_CURR']
categorical.head()
categorical_grouped = categorical.groupby('SK_ID_CURR').agg(['sum', 'mean'])
categorical_grouped.head()
# Merge the 'categorical_grouped' column with application_train file using the 'SK_ID_CURR' column as key
# Add column names
group_var = 'SK_ID_CURR'
# Need to create new column names
columns = []
# Iterate through the variables names
for var in categorical_grouped.columns.levels[0]:
    # Skip the grouping variable
    if var != group_var:
        # Iterate through the stat names.
        # NOTE(review): the aggregation above used ['sum', 'mean']; for 0/1
        # dummy columns 'sum' is a count and 'mean' a normalised count, which
        # is what the 'count'/'count_norm' labels here are meant to convey.
        for stat in ['count', 'count_norm']:
            # Make a new column name for the variable and stat
            columns.append('%s_%s' % (var, stat))
# Rename the columns
categorical_grouped.columns = columns
categorical_grouped.head()
# Merge the 'previous_loan_counts' column with application_train table using 'SK_ID_CURR'
previous_loan_counts = df_bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
previous_loan_counts.head()
# Convert the day-count columns to positive years, then drop the raw
# day-based columns.
df_bureau['YEARS_CREDIT'] = round((- (df_bureau['DAYS_CREDIT'] / 365) ), 2)
df_bureau['CREDIT_YEAR_OVERDUE'] = round((- (df_bureau['CREDIT_DAY_OVERDUE'] / 365) ),2)
df_bureau = df_bureau.drop(["DAYS_CREDIT", "CREDIT_DAY_OVERDUE", "DAYS_CREDIT_ENDDATE", "DAYS_ENDDATE_FACT", "CNT_CREDIT_PROLONG", "DAYS_CREDIT_UPDATE"], axis=1)
# Aggregate every remaining numeric bureau column per client (count/mean/max/min/sum).
bureau_agg = df_bureau.drop(columns = ['SK_ID_BUREAU']).groupby('SK_ID_CURR', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
bureau_agg.head()
# Assign column names. Merge 'bureau_agg' with 'application_train' using the 'SK_ID_CURR' column as key
columns = ['SK_ID_CURR']
# Iterate through the variables names
for var in bureau_agg.columns.levels[0]:
    # Skip the id name
    if var != 'SK_ID_CURR':
        # Iterate through the stat names
        for stat in bureau_agg.columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('bureau_%s_%s' % (var, stat))
bureau_agg.columns = columns
bureau_agg.head()
# Merge the following aggregate tables (all derived from bureau.csv) into the
# df_init dataset (derived from application_train.csv):
# 1) bureau_agg  2) previous_loan_counts  3) categorical_grouped
categorical_grouped.info()
# Left-join each bureau-derived aggregate onto the application data, keyed on SK_ID_CURR.
df_application_merged = df_init.merge(categorical_grouped, on = 'SK_ID_CURR', how = 'left')
df_application_merged.head()
df_application_merged = df_application_merged.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')
df_application_merged.head()
df_application_merged = df_application_merged.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
df_application_merged.head()
df_application_merged.info()
# BUG FIX: '==' compared the rcParam instead of assigning it; '=' enables
# interactive mode.
mpl.rcParams['interactive'] = True
#Using Pearson Correlation
plt.figure(figsize=(24,20))
cor = df_application_merged.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.savefig('Correlation_Pearson')
plt.show()
# Find correlations with the target and sort — reuse the matrix already computed.
correlations = cor['TARGET'].sort_values()
# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))
# Move the second column (index 1) to the end of the column order.
clist = list(df_application_merged.columns)
clist_new = clist[:1]+clist[2:]+clist[1:2]
#clist
# Pass the new list to the DataFrame - like a key list in a dict
df_application_merged_1 = df_application_merged[clist_new]
df_application_merged_1
# First-cut feature selection: keep the identifier, the strongest
# TARGET-correlated columns found above, and TARGET itself.
df_features_select = df_application_merged_1[["SK_ID_CURR","NAME_INCOME_TYPE",
"ORGANIZATION_TYPE",
"NAME_FAMILY_STATUS",
"NAME_EDUCATION_TYPE",
"CREDIT_TYPE_Credit card_count_norm",
"CREDIT_TYPE_Credit card_count",
"OWN_CAR_AGE",
"OCCUPATION_TYPE",
"REG_CITY_NOT_LIVE_CITY",
"CREDIT_TYPE_Microloan_count_norm",
"DAYS_EMPLOYED_YEAR",
"FLAG_EMP_PHONE",
"REGION_RATING_CLIENT_2",
"REGION_RATING_CLIENT_W_CITY_2",
"REG_CITY_NOT_WORK_CITY",
"CODE_GENDER",
"DAYS_LAST_PHONE_CHANGE",
"CREDIT_ACTIVE_Active_count",
"CREDIT_ACTIVE_Active_count_norm",
"EXT_SOURCE_3",
"EXT_SOURCE_2",
"EXT_SOURCE_1",
"bureau_YEARS_CREDIT_mean",
"CREDIT_ACTIVE_Closed_count_norm",
"DAYS_BIRTH_YEAR",
"bureau_YEARS_CREDIT_max",
"DAYS_ID_PUBLISH_YEAR",
"bureau_YEARS_CREDIT_min",
"FLOORSMAX_AVG",
"FLOORSMAX_MEDI",
"FLOORSMAX_MODE",
"EMERGENCYSTATE_MODE_No",
"bureau_YEARS_CREDIT_sum",
"DAYS_REGISTRATION_YEAR",
"TARGET"]]
# Locate any remaining missing values in the selection.
np.where(pd.isnull(df_features_select))
def missing_zero_values_table(df_features_select):
    """Summarise zero and missing values per column of ``df_features_select``.

    NOTE(review): this re-defines (shadows) the identical function declared
    earlier in this script — the two definitions should be unified.

    Returns a DataFrame with one row per column that has at least one
    missing value, holding the count of zeros, the count and percentage
    of NaNs, their combined total and percentage, and the column dtype,
    sorted by '% of Total Values' descending (rounded to 1 decimal).
    """
    zero_val = (df_features_select == 0.00).astype(int).sum(axis=0)
    mis_val = df_features_select.isnull().sum()
    # Reuse the NaN counts instead of recomputing isnull().sum().
    mis_val_percent = 100 * mis_val / len(df_features_select)
    mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
    mz_table = mz_table.rename(
        columns={0: 'Zero Values', 1: 'Missing Values', 2: '% of Total Values'})
    mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
    mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df_features_select)
    mz_table['Data Type'] = df_features_select.dtypes
    # Keep only columns that actually have missing values, worst first.
    mz_table = mz_table[mz_table.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has {} columns and {} Rows.\n"
          "There are {} columns that have missing values.".format(
              df_features_select.shape[1], df_features_select.shape[0], mz_table.shape[0]))
    # mz_table.to_excel('missing_and_zero_values.xlsx', freeze_panes=(1,0), index = False)
    return mz_table
missing_zero_values_table(df_features_select)
# BUG FIX: '==' was a comparison with no effect; '=' actually sets the option.
mpl.rcParams['interactive'] = True
#Using Pearson Correlation
plt.figure(figsize=(24,20))
cor = df_features_select.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.savefig('Correlation_Pearson')
plt.show()
# Final, reduced feature set: drops the near-duplicate columns of the first
# selection (the *_count_norm twins, FLOORSMAX_MEDI/_MODE, the extra
# bureau_YEARS_CREDIT aggregates, FLAG_EMP_PHONE,
# REGION_RATING_CLIENT_W_CITY_2).
# FIX: .copy() makes this an explicit independent DataFrame so the in-place
# column assignment below does not raise SettingWithCopyWarning.
df_features_select = df_application_merged_1[["SK_ID_CURR",
"NAME_INCOME_TYPE",
"ORGANIZATION_TYPE",
"NAME_FAMILY_STATUS",
"NAME_EDUCATION_TYPE",
"CREDIT_TYPE_Credit card_count",
"OWN_CAR_AGE",
"OCCUPATION_TYPE",
"REG_CITY_NOT_LIVE_CITY",
"CREDIT_TYPE_Microloan_count_norm",
"DAYS_EMPLOYED_YEAR",
"REGION_RATING_CLIENT_2",
"REG_CITY_NOT_WORK_CITY",
"CODE_GENDER",
"DAYS_LAST_PHONE_CHANGE",
"CREDIT_ACTIVE_Active_count",
"EXT_SOURCE_3",
"EXT_SOURCE_2",
"EXT_SOURCE_1",
"bureau_YEARS_CREDIT_mean",
"DAYS_BIRTH_YEAR",
"DAYS_ID_PUBLISH_YEAR",
"FLOORSMAX_AVG",
"EMERGENCYSTATE_MODE_No",
"bureau_YEARS_CREDIT_sum",
"DAYS_REGISTRATION_YEAR",
"TARGET"]].copy()
df_features_select.isnull().sum()
df_features_select.head()
df_features_select.dtypes
df_region_rating = df_features_select.groupby('REGION_RATING_CLIENT_2')['SK_ID_CURR'].nunique()
df_region_rating
# Binarise the one-hot 1.0/0.0 column into True/False.
df_features_select['REGION_RATING_CLIENT_2'] = np.where(df_features_select['REGION_RATING_CLIENT_2'] == 1, True, False)
# Load previous Home Credit applications (one row per prior application).
print('Reading the data....', end='')
previous_applicaton = pd.read_csv(r'C:\Users\mamta\MMAI 2020\MMAI823_AI in Finance\Team Assignments\Team Project\Home Credit Risk Dataset\previous_application.csv',sep=',')
print('done!!!')
print('The shape of data:',previous_applicaton.shape)
print('First 5 rows of data:')
previous_applicaton.head()
list(previous_applicaton.columns)
# Drop the columns that are not used in the downstream aggregation.
previous_applicaton.drop(['WEEKDAY_APPR_PROCESS_START',
'HOUR_APPR_PROCESS_START','NFLAG_LAST_APPL_IN_DAY','RATE_DOWN_PAYMENT',
'RATE_INTEREST_PRIMARY',
'RATE_INTEREST_PRIVILEGED',
'NAME_CASH_LOAN_PURPOSE',
'NAME_CONTRACT_STATUS',
'DAYS_DECISION',
'NAME_PAYMENT_TYPE','NAME_TYPE_SUITE',
'NAME_CLIENT_TYPE',
'NAME_GOODS_CATEGORY',
'NAME_PORTFOLIO',
'NAME_PRODUCT_TYPE',
'CHANNEL_TYPE',
'SELLERPLACE_AREA',
'NAME_SELLER_INDUSTRY','NAME_YIELD_GROUP',
'PRODUCT_COMBINATION',
'DAYS_FIRST_DRAWING',
'DAYS_FIRST_DUE',
'DAYS_LAST_DUE_1ST_VERSION',
'DAYS_LAST_DUE',
'DAYS_TERMINATION',
'NFLAG_INSURED_ON_APPROVAL'], axis=1, inplace=True)
previous_applicaton.dtypes
# Convert the remaining text columns to pandas 'category' dtype so they are
# picked up by select_dtypes('category') below.
previous_applicaton.NAME_CONTRACT_TYPE.unique()
previous_applicaton['NAME_CONTRACT_TYPE'] = previous_applicaton['NAME_CONTRACT_TYPE'].astype('category')
previous_applicaton.FLAG_LAST_APPL_PER_CONTRACT.unique()
previous_applicaton['FLAG_LAST_APPL_PER_CONTRACT'] = previous_applicaton['FLAG_LAST_APPL_PER_CONTRACT'].astype('category')
previous_applicaton.CODE_REJECT_REASON.unique()
previous_applicaton['CODE_REJECT_REASON'] = previous_applicaton['CODE_REJECT_REASON'].astype('category')
# Number of previous applications per customer
grp = previous_applicaton[['SK_ID_CURR','SK_ID_PREV']].groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].count().reset_index().rename(columns={'SK_ID_PREV':'PREV_APP_COUNT'})
application_bureau_prev = df_features_select.merge(grp, on =['SK_ID_CURR'], how = 'left')
application_bureau_prev['PREV_APP_COUNT'] = application_bureau_prev['PREV_APP_COUNT'].fillna(0)
# Combining numerical features: per-client means, prefixed with 'PREV_'.
grp = previous_applicaton.drop('SK_ID_PREV', axis =1).groupby(by=['SK_ID_CURR']).mean().reset_index()
prev_columns = ['PREV_'+column if column != 'SK_ID_CURR' else column for column in grp.columns ]
grp.columns = prev_columns
application_bureau_prev = application_bureau_prev.merge(grp, on =['SK_ID_CURR'], how = 'left')
# Fill the NaNs introduced by the left join with 0.
application_bureau_prev.update(application_bureau_prev[grp.columns].fillna(0))
# Combining categorical features: per-client max of the dummies, prefixed 'PREV_'.
prev_categorical = pd.get_dummies(previous_applicaton.select_dtypes('category'))
prev_categorical['SK_ID_CURR'] = previous_applicaton['SK_ID_CURR']
prev_categorical.head()
grp = prev_categorical.groupby('SK_ID_CURR').max().reset_index()
grp.columns = ['PREV_'+column if column != 'SK_ID_CURR' else column for column in grp.columns]
application_bureau_prev = application_bureau_prev.merge(grp, on=['SK_ID_CURR'], how='left')
application_bureau_prev.update(application_bureau_prev[grp.columns].fillna(0))
print('Reading the data....', end='')
pos_cash = pd.read_csv(r'C:\Users\mamta\MMAI 2020\MMAI823_AI in Finance\Team Assignments\Team Project\Home Credit Risk Dataset\POS_CASH_balance.csv',sep=',')
print('done!!!')
print('The shape of data:',pos_cash.shape)
print('First 5 rows of data:')
pos_cash.head()
# Days-past-due columns are not used downstream.
pos_cash.drop(['SK_DPD','SK_DPD_DEF'], axis=1, inplace=True)
# Combining numerical features: per-customer mean, prefixed with POS_
pos_num = pos_cash.drop('SK_ID_PREV', axis=1).groupby('SK_ID_CURR').mean().reset_index()
pos_num = pos_num.rename(columns=lambda c: c if c == 'SK_ID_CURR' else 'POS_' + c)
application_bureau_prev = application_bureau_prev.merge(pos_num, on=['SK_ID_CURR'], how='left')
application_bureau_prev.update(application_bureau_prev[pos_num.columns].fillna(0))
# Combining categorical features: one-hot, then per-customer mean (share of records)
pos_cash_categorical = pd.get_dummies(pos_cash.select_dtypes('object'))
pos_cash_categorical['SK_ID_CURR'] = pos_cash['SK_ID_CURR']
pos_cat = pos_cash_categorical.groupby('SK_ID_CURR').mean().reset_index()
pos_cat = pos_cat.rename(columns=lambda c: c if c == 'SK_ID_CURR' else 'POS_' + c)
application_bureau_prev = application_bureau_prev.merge(pos_cat, on=['SK_ID_CURR'], how='left')
application_bureau_prev.update(application_bureau_prev[pos_cat.columns].fillna(0))
print('Reading the data....', end='')
insta_payments = pd.read_csv(r'C:\Users\mamta\MMAI 2020\MMAI823_AI in Finance\Team Assignments\Team Project\Home Credit Risk Dataset\installments_payments.csv',sep=',')
print('done!!!')
print('The shape of data:',insta_payments.shape)
print('First 5 rows of data:')
insta_payments.head()
# Timing columns are dropped; only the amount features are aggregated.
insta_payments.drop(['DAYS_INSTALMENT','DAYS_ENTRY_PAYMENT'], axis=1, inplace=True)
# Combining numerical features and there are no categorical features in this dataset
insta_num = insta_payments.drop('SK_ID_PREV', axis=1).groupby('SK_ID_CURR').mean().reset_index()
insta_num = insta_num.rename(columns=lambda c: c if c == 'SK_ID_CURR' else 'INSTA_' + c)
application_bureau_prev = application_bureau_prev.merge(insta_num, on=['SK_ID_CURR'], how='left')
application_bureau_prev.update(application_bureau_prev[insta_num.columns].fillna(0))
print('Reading the data....', end='')
credit_card = pd.read_csv(r'C:\Users\mamta\MMAI 2020\MMAI823_AI in Finance\Team Assignments\Team Project\Home Credit Risk Dataset\credit_card_balance.csv')
print('done!!!')
print('The shape of data:',credit_card.shape)
print('First 5 rows of data:')
credit_card.head()
credit_card.dtypes
# MONTHS_BALANCE counts months before application as negative numbers;
# negate and divide by 12 to express it as positive years, 2-decimal precision.
credit_card['YEARS_Balance'] = (-(credit_card['MONTHS_BALANCE'] / 12)).round(2)
# Drawing-count / DPD columns and the raw month counter are not used downstream.
credit_card.drop(['CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
                  'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
                  'CNT_INSTALMENT_MATURE_CUM', 'SK_DPD', 'SK_DPD_DEF',
                  'MONTHS_BALANCE'], axis=1, inplace=True)
# Combining numerical features: per-customer mean, prefixed with CREDIT_
cc_num = credit_card.drop('SK_ID_PREV', axis=1).groupby('SK_ID_CURR').mean().reset_index()
cc_num = cc_num.rename(columns=lambda c: c if c == 'SK_ID_CURR' else 'CREDIT_' + c)
application_bureau_prev = application_bureau_prev.merge(cc_num, on=['SK_ID_CURR'], how='left')
application_bureau_prev.update(application_bureau_prev[cc_num.columns].fillna(0))
# Combining categorical features: one-hot, then per-customer mean
credit_categorical = pd.get_dummies(credit_card.select_dtypes('object'))
credit_categorical['SK_ID_CURR'] = credit_card['SK_ID_CURR']
cc_cat = credit_categorical.groupby('SK_ID_CURR').mean().reset_index()
cc_cat = cc_cat.rename(columns=lambda c: c if c == 'SK_ID_CURR' else 'CREDIT_' + c)
application_bureau_prev = application_bureau_prev.merge(cc_cat, on=['SK_ID_CURR'], how='left')
application_bureau_prev.update(application_bureau_prev[cc_cat.columns].fillna(0))
# FIX: the original used '==' here, a no-op comparison; '=' actually sets the flag.
mpl.rcParams['interactive'] = True
#Using Pearson Correlation
plt.figure(figsize=(24,20))
cor = application_bureau_prev.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.savefig('Correlation_Pearson')
plt.show()
# Find correlations with the target and sort.
# Reuse `cor` instead of recomputing the full correlation matrix a second time
# (corr() over hundreds of columns is expensive).
correlations = cor['TARGET'].sort_values()
# Display correlations
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))
application_bureau_prev.info()
# Impute the remaining missing values.
# The original re-imported SimpleImputer and rebuilt the imputer six times in a
# row; one import and a column -> strategy table do the same work.
from sklearn.impute import SimpleImputer

_impute_strategies = {
    'EXT_SOURCE_3': 'mean',
    'EXT_SOURCE_2': 'mean',
    'EXT_SOURCE_1': 'mean',
    'DAYS_LAST_PHONE_CHANGE': 'median',
    'OWN_CAR_AGE': 'most_frequent',
    'FLOORSMAX_AVG': 'most_frequent',
}
for _col, _strategy in _impute_strategies.items():
    _imp = SimpleImputer(strategy=_strategy)
    # fit_transform expects 2-D input, hence the [[...]]; ravel back to 1-D.
    application_bureau_prev[_col] = _imp.fit_transform(application_bureau_prev[[_col]]).ravel()

# For these engineered bureau/credit aggregates a missing value means
# "no matching record", so zero is the appropriate fill.
for _col in ['CREDIT_TYPE_Credit card_count',
             'CREDIT_TYPE_Microloan_count_norm',
             'bureau_YEARS_CREDIT_mean',
             'bureau_YEARS_CREDIT_sum',
             'CREDIT_ACTIVE_Active_count']:
    # Assign instead of Series.fillna(inplace=True): the in-place form on a
    # column selection is chained assignment, deprecated in modern pandas.
    application_bureau_prev[_col] = application_bureau_prev[_col].fillna(0)
application_bureau_prev.isnull().sum()
# Scale the numeric columns to [0, 1] and one-hot encode the categoricals,
# then recombine into the final modelling frame.
num_cols = application_bureau_prev.select_dtypes([np.number]).columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
application_bureau_prev_num = application_bureau_prev[num_cols]
application_bureau_prev_num = scaler.fit_transform(application_bureau_prev_num)
application_bureau_prev_num_final = pd.DataFrame(application_bureau_prev_num, columns=num_cols)
# NOTE(review): `types` is not used in this section; kept in case later code reads it.
types = np.array([dt for dt in application_bureau_prev.dtypes])
all_columns = application_bureau_prev.columns.values
cat_cols = [x for x in all_columns if x not in num_cols]
cat_cols
# Featurization of categorical data: constant-fill missing, then one-hot encode.
# FIX: OneHotEncoder was used below without a visible import.
from sklearn.preprocessing import OneHotEncoder
imputer_cat = SimpleImputer(strategy='constant', fill_value='MISSING')
application_bureau_prev_cat = imputer_cat.fit_transform(application_bureau_prev[cat_cols])
application_bureau_prev_cat1 = pd.DataFrame(application_bureau_prev_cat, columns=cat_cols)
# NOTE(review): on scikit-learn >= 1.2 this must become sparse_output=False and
# get_feature_names_out(); the names below match the pre-1.0 API this file uses
# (see the version print at the top of the file) — confirm before upgrading.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
application_bureau_prev_cat2 = ohe.fit_transform(application_bureau_prev_cat1)
cat_cols_ohe = list(ohe.get_feature_names(input_features=cat_cols))
application_bureau_prev_cat_final = pd.DataFrame(application_bureau_prev_cat2, columns=cat_cols_ohe)
# Final complete data
application_bureau_prev_final = pd.concat([application_bureau_prev_num_final, application_bureau_prev_cat_final], axis=1)
print(application_bureau_prev_final.shape)
from sklearn.model_selection import train_test_split

# pop() removes 'TARGET' from the frame and yields the label array;
# SK_ID_CURR is an identifier, not a feature, so it is dropped from X.
y = application_bureau_prev_final.pop('TARGET').values
features = application_bureau_prev_final.drop(['SK_ID_CURR'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    features, y, stratify=y, test_size=0.3, random_state=42)
print('Shape of X_train:', X_train.shape)
print('Shape of X_test:', X_test.shape)
# Missing-value summary for the training split.
null_counts = X_train.isnull().sum()
count = null_counts.sort_values(ascending=False)
percentage = (null_counts / len(X_train) * 100).sort_values(ascending=False)
missing_X_train = pd.concat([count, percentage], axis=1, keys=['Count', 'Percentage'])
print('Count and percentage of missing values for top 20 columns:')
missing_X_train.head(20)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Univariate chi2 feature scoring. chi2 requires non-negative inputs, which
# holds here: numeric columns were MinMax-scaled to [0, 1] and the one-hot
# columns are 0/1.
# NOTE: k=10 only controls which features SelectKBest would *keep*; fit.scores_
# is computed for every column, so the top-50 listing below is still valid.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X_train,y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(50,'Score')) #print the 50 best-scoring features
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

# Fit a quick tree ensemble purely to rank features by impurity importance.
# FIX: random_state pinned so the ranking (and the plot) is reproducible
# run-to-run; the original unseeded model gave a different ranking each time.
model = ExtraTreesClassifier(random_state=42)
model.fit(X_train, y_train)
# Plot the 50 most important features for visual inspection.
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
plt.figure(figsize=(20,20))
feat_importances.nlargest(50).plot(kind='barh')
plt.show()
list(X_train.columns)
# Final hand-picked feature set (from the chi2 scores and tree importances above).
# FIX: the original duplicated this 126-name list verbatim for X_train and
# X_test; defining it once guarantees the two splits can never drift apart.
selected_features = [
    'CREDIT_TYPE_Credit card_count',
    'OWN_CAR_AGE',
    'CREDIT_TYPE_Microloan_count_norm',
    'DAYS_EMPLOYED_YEAR',
    'DAYS_LAST_PHONE_CHANGE',
    'CREDIT_ACTIVE_Active_count',
    'EXT_SOURCE_3',
    'EXT_SOURCE_2',
    'EXT_SOURCE_1',
    'bureau_YEARS_CREDIT_mean',
    'DAYS_BIRTH_YEAR',
    'DAYS_ID_PUBLISH_YEAR',
    'FLOORSMAX_AVG',
    'EMERGENCYSTATE_MODE_No',
    'bureau_YEARS_CREDIT_sum',
    'DAYS_REGISTRATION_YEAR',
    'PREV_APP_COUNT',
    'PREV_AMT_ANNUITY',
    'PREV_AMT_APPLICATION',
    'PREV_AMT_CREDIT',
    'PREV_AMT_DOWN_PAYMENT',
    'PREV_AMT_GOODS_PRICE',
    'PREV_CNT_PAYMENT',
    'PREV_NAME_CONTRACT_TYPE_Cash loans',
    'PREV_NAME_CONTRACT_TYPE_Consumer loans',
    'PREV_NAME_CONTRACT_TYPE_Revolving loans',
    'PREV_NAME_CONTRACT_TYPE_XNA',
    'PREV_CODE_REJECT_REASON_CLIENT',
    'PREV_CODE_REJECT_REASON_HC',
    'PREV_CODE_REJECT_REASON_LIMIT',
    'PREV_CODE_REJECT_REASON_SCO',
    'PREV_CODE_REJECT_REASON_SCOFR',
    'PREV_CODE_REJECT_REASON_SYSTEM',
    'PREV_CODE_REJECT_REASON_VERIF',
    'PREV_CODE_REJECT_REASON_XAP',
    'PREV_CODE_REJECT_REASON_XNA',
    'POS_MONTHS_BALANCE',
    'POS_CNT_INSTALMENT_FUTURE',
    'INSTA_AMT_INSTALMENT',
    'INSTA_AMT_PAYMENT',
    'CREDIT_AMT_BALANCE',
    'CREDIT_AMT_CREDIT_LIMIT_ACTUAL',
    'CREDIT_AMT_DRAWINGS_ATM_CURRENT',
    'CREDIT_AMT_INST_MIN_REGULARITY',
    'CREDIT_AMT_PAYMENT_CURRENT',
    'CREDIT_AMT_PAYMENT_TOTAL_CURRENT',
    'CREDIT_AMT_RECEIVABLE_PRINCIPAL',
    'CREDIT_AMT_RECIVABLE',
    'NAME_INCOME_TYPE_Businessman',
    'NAME_INCOME_TYPE_Commercial associate',
    'NAME_INCOME_TYPE_Maternity leave',
    'NAME_INCOME_TYPE_Pensioner',
    'NAME_INCOME_TYPE_State servant',
    'NAME_INCOME_TYPE_Student',
    'NAME_INCOME_TYPE_Unemployed',
    'NAME_INCOME_TYPE_Working',
    'ORGANIZATION_TYPE_Advertising',
    'ORGANIZATION_TYPE_Agriculture',
    'ORGANIZATION_TYPE_Bank',
    'ORGANIZATION_TYPE_Business',
    'ORGANIZATION_TYPE_Cleaning',
    'ORGANIZATION_TYPE_Construction',
    'ORGANIZATION_TYPE_Culture',
    'ORGANIZATION_TYPE_Electricity',
    'ORGANIZATION_TYPE_Emergency',
    'ORGANIZATION_TYPE_Government',
    'ORGANIZATION_TYPE_Hotel',
    'ORGANIZATION_TYPE_Housing',
    'ORGANIZATION_TYPE_Industry',
    'ORGANIZATION_TYPE_Insurance',
    'ORGANIZATION_TYPE_Kindergarten',
    'ORGANIZATION_TYPE_Legal Services',
    'ORGANIZATION_TYPE_Medicine',
    'ORGANIZATION_TYPE_Military',
    'ORGANIZATION_TYPE_Mobile',
    'ORGANIZATION_TYPE_Other',
    'ORGANIZATION_TYPE_Police',
    'ORGANIZATION_TYPE_Postal',
    'ORGANIZATION_TYPE_Realtor',
    'ORGANIZATION_TYPE_Religion',
    'ORGANIZATION_TYPE_Restaurant',
    'ORGANIZATION_TYPE_School',
    'ORGANIZATION_TYPE_Security',
    'ORGANIZATION_TYPE_Security Ministries',
    'ORGANIZATION_TYPE_Self-employed',
    'ORGANIZATION_TYPE_Services',
    'ORGANIZATION_TYPE_Telecom',
    'ORGANIZATION_TYPE_Trade',
    'ORGANIZATION_TYPE_Transport',
    'ORGANIZATION_TYPE_University',
    'ORGANIZATION_TYPE_XNA',
    'NAME_FAMILY_STATUS_Civil marriage',
    'NAME_FAMILY_STATUS_Married',
    'NAME_FAMILY_STATUS_Separated',
    'NAME_FAMILY_STATUS_Single / not married',
    'NAME_FAMILY_STATUS_Unknown',
    'NAME_FAMILY_STATUS_Widow',
    'NAME_EDUCATION_TYPE_Academic degree',
    'NAME_EDUCATION_TYPE_Higher education',
    'NAME_EDUCATION_TYPE_Incomplete higher',
    'NAME_EDUCATION_TYPE_Lower secondary',
    'NAME_EDUCATION_TYPE_Secondary / secondary special',
    'OCCUPATION_TYPE_Accountants',
    'OCCUPATION_TYPE_Cleaning staff',
    'OCCUPATION_TYPE_Cooking staff',
    'OCCUPATION_TYPE_Core staff',
    'OCCUPATION_TYPE_Drivers',
    'OCCUPATION_TYPE_HR staff',
    'OCCUPATION_TYPE_High skill tech staff',
    'OCCUPATION_TYPE_IT staff',
    'OCCUPATION_TYPE_Laborers',
    'OCCUPATION_TYPE_Low-skill Laborers',
    'OCCUPATION_TYPE_Managers',
    'OCCUPATION_TYPE_Medicine staff',
    'OCCUPATION_TYPE_Private service staff',
    'OCCUPATION_TYPE_Realty agents',
    'OCCUPATION_TYPE_Sales staff',
    'OCCUPATION_TYPE_Secretaries',
    'OCCUPATION_TYPE_Security staff',
    'OCCUPATION_TYPE_UNKNOWN',
    'OCCUPATION_TYPE_Waiters/barmen staff',
    'REG_CITY_NOT_WORK_CITY_False',
    'REG_CITY_NOT_WORK_CITY_True',
    'CODE_GENDER_False',
    'CODE_GENDER_True',
]
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]
# Persist the prepared splits for later modelling runs.
# Feature frames go to CSV; the 0/1 label arrays go to plain text files.
for _fname, _frame in (('X_train_redo.csv', X_train), ('X_test_redo.csv', X_test)):
    _frame.to_csv(_fname)
for _fname, _labels in (('y_train_redo.txt', y_train), ('y_test_redo.txt', y_test)):
    np.savetxt(_fname, _labels, fmt='%d')